1. Loading libraries

To see which packages are used, please inspect the load_libraries and regression_metrics files.

source('load_libraries.R')
source("regression_metrics.R")

2. Loading + organising data

Via gist paths, the datasets are loaded in data.table format.

house_train <- fread("https://gist.githubusercontent.com/geraldwal/c15f40a258a15f6417e70705d57e9a21/raw/800519c3f4d633b8985cf9fc4b60ebccc89494eb/house_price_train")
house_test <- fread("https://gist.githubusercontent.com/geraldwal/58f48ae5a44f7061ced0c3486e2f07aa/raw/5a77070d797ffa9598c0eb6f37facb6a2a20c78b/house_price_test")

head(house_train)
           id      date  price bedrooms bathrooms sqft_living sqft_lot
1: 9183703376 5/13/2014 225000        3      1.50        1250     7500
2:  464000600 8/27/2014 641250        3      2.50        2220     2550
3: 2224079050 7/18/2014 810000        4      3.50        3980   209523
4: 6163901283 1/30/2015 330000        4      1.50        1890     7540
5: 6392003810 5/23/2014 530000        4      1.75        1814     5000
6: 7974200948 5/20/2014 953007        4      3.50        3120     5086
   floors waterfront view condition grade sqft_above sqft_basement
1:      1          0    0         3     7       1250             0
2:      3          0    2         3    10       2220             0
3:      2          0    2         3     9       3980             0
4:      1          0    0         4     7       1890             0
5:      1          0    0         4     7        944           870
6:      2          0    0         3     9       2480           640
   yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
1:     1967            0   98030 47.3719 -122.215          1260       7563
2:     1990            0   98117 47.6963 -122.393          2200       5610
3:     2006            0   98024 47.5574 -121.890          2220      65775
4:     1967            0   98155 47.7534 -122.318          1890       8515
5:     1951            0   98115 47.6840 -122.281          1290       5000
6:     2008            0   98115 47.6762 -122.288          1880       5092
head(house_test)
           id       date bedrooms bathrooms sqft_living sqft_lot floors
1: 6414100192  12/9/2014        3      2.25        2570     7242    2.0
2: 6054650070  10/7/2014        3      1.75        1370     9680    1.0
3:   16000397  12/5/2014        2      1.00        1200     9850    1.0
4: 2524049179  8/26/2014        3      2.75        3050    44867    1.0
5: 8562750320 11/10/2014        3      2.50        2320     3980    2.0
6: 7589200193 11/10/2014        3      1.00        1090     3000    1.5
   waterfront view condition grade sqft_above sqft_basement yr_built
1:          0    0         3     7       2170           400     1951
2:          0    0         4     7       1370             0     1977
3:          0    0         4     7       1200             0     1921
4:          0    4         3     9       2330           720     1968
5:          0    0         3     8       2320             0     2003
6:          0    0         4     8       1090             0     1929
   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
1:         1991   98125 47.7210 -122.319          1690       7639
2:            0   98074 47.6127 -122.045          1370      10208
3:            0   98002 47.3089 -122.210          1060       5095
4:            0   98040 47.5316 -122.233          4110      20336
5:            0   98027 47.5391 -122.070          2580       3980
6:            0   98117 47.6889 -122.375          1570       5080

In this chunk, a split column is added in order to easily split and stack the datasets when needed.

house_test$split <- "test"
house_test$price <- 0
house_train$split <- "train"

house_data <- rbind(house_train, house_test)
str(house_data)
Classes 'data.table' and 'data.frame':  21597 obs. of  22 variables:
 $ id           :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ... 
 $ date         : chr  "5/13/2014" "8/27/2014" "7/18/2014" "1/30/2015" ...
 $ price        : num  225000 641250 810000 330000 530000 ...
 $ bedrooms     : int  3 3 4 4 4 4 4 3 4 3 ...
 $ bathrooms    : num  1.5 2.5 3.5 1.5 1.75 3.5 3.25 2.25 2.5 1.5 ...
 $ sqft_living  : int  1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
 $ sqft_lot     : int  7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
 $ floors       : num  1 3 2 1 1 2 2 1 2 1 ...
 $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ view         : int  0 2 2 0 0 0 0 0 0 0 ...
 $ condition    : int  3 3 3 4 4 3 3 3 3 3 ...
 $ grade        : int  7 10 9 7 7 9 10 8 9 8 ...
 $ sqft_above   : int  1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
 $ sqft_basement: int  0 0 0 0 870 640 0 310 0 1040 ...
 $ yr_built     : int  1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
 $ yr_renovated : int  0 0 0 0 0 0 0 0 0 0 ...
 $ zipcode      : int  98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
 $ lat          : num  47.4 47.7 47.6 47.8 47.7 ...
 $ long         : num  -122 -122 -122 -122 -122 ...
 $ sqft_living15: int  1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
 $ sqft_lot15   : int  7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
 $ split        : chr  "train" "train" "train" "train" ...
 - attr(*, ".internal.selfref")=<externalptr> 
# Copy to continue working on
hd <- house_data

Only keep unique rows.

sum(duplicated(hd$id))
[1] 177
hd <- hd[!duplicated(hd$id), ]

hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),]

3. EDA

Analysis of the different variables, based on their individual distributions and relation according to the target variable price.

table(hd$floors)

    1   1.5     2   2.5     3   3.5 
10552  1888  8203   161   609     7 
unique(house_data$bedrooms)
 [1]  3  4  5  2  6  1  7  8  9 10 33 11
unique(house_data$zipcode)
 [1] 98030 98117 98024 98155 98115 98072 98023 98058 98056 98045 98028
[12] 98118 98144 98074 98108 98008 98027 98038 98107 98133 98166 98065
[23] 98042 98040 98022 98006 98033 98052 98034 98092 98112 98198 98178
[34] 98106 98168 98053 98014 98077 98103 98001 98029 98019 98031 98075
[45] 98059 98136 98039 98011 98119 98122 98126 98177 98003 98116 98105
[56] 98005 98032 98146 98109 98199 98055 98070 98007 98002 98125 98148
[67] 98010 98004 98188 98102
plot(table(house_train$bedrooms))

plot(table(house_train$yr_renovated))

plot(table(house_train$sqft_basement))

unique(house_data$long)
  [1] -122.215 -122.393 -121.890 -122.318 -122.281 -122.288 -122.115
  [8] -122.391 -122.151 -122.306 -122.182 -121.707 -122.244 -122.104
 [15] -122.266 -122.297 -122.174 -122.051 -122.295 -122.301 -122.396
 [22] -122.105 -121.774 -122.230 -122.092 -122.041 -122.367 -122.350
 [29] -122.339 -122.122 -121.879 -122.119 -122.022 -122.128 -122.316
 [36] -122.213 -122.027 -122.114 -122.171 -122.103 -122.214 -122.087
 [43] -122.191 -121.880 -122.173 -122.309 -122.247 -122.290 -122.358
 [50] -121.966 -122.277 -122.210 -122.268 -122.074 -122.089 -122.190
 [57] -122.047 -122.017 -122.362 -122.274 -122.395 -121.920 -122.052
 [64] -122.223 -122.033 -122.167 -122.335 -122.161 -122.331 -122.152
 [71] -122.365 -122.340 -122.276 -122.018 -122.338 -122.349 -121.983
 [78] -122.222 -122.045 -121.994 -122.116 -122.291 -122.144 -122.176
 [85] -122.184 -122.389 -122.257 -122.226 -122.181 -122.313 -122.100
 [92] -122.206 -122.066 -122.400 -122.133 -122.278 -122.361 -122.299
 [99] -122.351 -122.056 -122.264 -122.200 -122.371 -122.279 -122.384
[106] -122.307 -122.140 -122.235 -122.319 -122.368 -122.189 -122.369
[113] -122.029 -121.951 -122.390 -122.370 -122.308 -121.973 -122.168
[120] -122.156 -122.330 -122.324 -122.280 -122.229 -121.984 -122.317
[127] -122.285 -122.379 -122.377 -122.157 -122.153 -122.164 -122.386
[134] -122.347 -122.064 -122.071 -122.134 -122.023 -122.388 -121.999
[141] -122.356 -122.013 -122.038 -122.253 -121.976 -122.021 -122.232
[148] -122.397 -122.172 -122.188 -122.108 -122.242 -122.387 -122.314
[155] -122.170 -122.372 -122.149 -122.272 -122.376 -122.135 -122.273
[162] -122.097 -122.342 -121.878 -122.286 -122.150 -122.030 -122.322
[169] -121.709 -122.165 -122.270 -122.085 -122.402 -122.121 -122.058
[176] -122.403 -121.871 -122.246 -122.267 -122.289 -122.073 -122.055
[183] -122.040 -121.969 -121.970 -122.007 -121.764 -122.109 -122.217
[190] -122.378 -122.192 -122.185 -122.298 -122.065 -122.256 -122.444
[197] -122.132 -122.227 -122.259 -122.175 -122.025 -122.009 -122.399
[204] -122.409 -122.112 -122.294 -122.383 -122.243 -122.196 -122.169
[211] -122.123 -122.237 -122.154 -122.332 -122.392 -122.300 -122.357
[218] -122.024 -122.304 -122.346 -122.094 -122.233 -122.062 -122.020
[225] -122.303 -122.099 -122.197 -122.359 -122.036 -122.083 -121.912
[232] -122.284 -121.748 -122.186 -122.320 -122.117 -122.158 -122.178
[239] -122.032 -122.068 -122.265 -121.967 -122.031 -122.310 -122.141
[246] -122.225 -121.883 -122.204 -121.996 -122.130 -121.841 -122.208
[253] -122.124 -122.328 -122.269 -121.961 -121.902 -122.260 -121.842
[260] -122.315 -122.275 -122.106 -122.003 -122.327 -121.982 -122.353
[267] -122.312 -122.287 -122.162 -122.126 -122.180 -122.203 -122.147
[274] -122.216 -122.166 -122.302 -122.160 -122.050 -122.321 -122.283
[281] -122.352 -122.375 -122.187 -122.385 -122.404 -121.772 -122.245
[288] -122.271 -122.136 -121.965 -122.049 -122.363 -122.205 -122.070
[295] -122.221 -122.212 -121.909 -122.113 -122.037 -122.120 -122.311
[302] -122.219 -122.125 -122.046 -122.412 -122.231 -122.195 -122.252
[309] -121.770 -122.373 -122.148 -122.201 -122.381 -121.869 -121.864
[316] -122.241 -122.209 -121.874 -121.778 -122.374 -121.876 -122.129
[323] -121.948 -122.366 -122.015 -121.921 -121.776 -122.194 -121.746
[330] -121.862 -122.043 -122.127 -122.014 -122.218 -122.111 -122.042
[337] -122.091 -122.394 -122.179 -121.988 -122.292 -122.445 -121.998
[344] -122.293 -122.080 -122.146 -122.001 -122.360 -122.341 -121.893
[351] -122.000 -122.410 -122.005 -122.282 -121.783 -122.077 -121.905
[358] -122.081 -122.345 -121.980 -122.061 -122.019 -122.482 -121.990
[365] -121.989 -121.979 -122.421 -122.355 -121.769 -122.337 -122.054
[372] -121.845 -122.348 -121.971 -122.110 -121.954 -122.193 -122.069
[379] -122.159 -122.059 -122.101 -122.248 -122.234 -122.236 -122.202
[386] -122.382 -121.908 -122.155 -121.992 -121.745 -122.072 -122.380
[393] -122.258 -122.067 -122.329 -122.326 -122.093 -121.829 -122.090
[400] -121.889 -121.759 -122.228 -121.773 -122.211 -122.039 -122.224
[407] -122.075 -122.012 -122.163 -121.915 -122.118 -122.139 -122.010
[414] -122.325 -122.364 -122.088 -121.997 -122.240 -122.263 -122.078
[421] -122.095 -122.199 -122.405 -122.344 -122.238 -122.239 -122.082
[428] -121.993 -121.763 -121.718 -122.323 -122.107 -122.334 -122.354
[435] -121.860 -122.142 -122.446 -122.183 -122.086 -122.016 -121.901
[442] -121.963 -122.198 -121.977 -122.262 -121.910 -121.756 -122.343
[449] -121.904 -122.026 -121.926 -122.137 -122.333 -122.220 -122.336
[456] -122.406 -121.788 -121.873 -121.870 -121.964 -122.177 -121.899
[463] -121.972 -121.858 -121.716 -121.945 -122.143 -122.063 -122.034
[470] -121.881 -122.296 -122.261 -122.138 -121.887 -121.944 -122.305
[477] -121.959 -122.251 -121.744 -121.801 -122.464 -121.995 -122.028
[484] -122.048 -122.131 -122.145 -121.855 -121.723 -122.044 -121.698
[491] -122.413 -121.789 -121.898 -121.930 -121.758 -121.903 -122.255
[498] -122.249 -121.958 -122.011 -121.986 -122.102 -122.004 -121.859
[505] -122.098 -121.922 -121.754 -121.962 -122.060 -121.956 -122.057
[512] -121.975 -121.866 -122.035 -121.868 -122.416 -122.084 -121.888
[519] -122.079 -121.940 -121.786 -122.006 -121.838 -122.076 -122.443
[526] -121.947 -122.053 -122.207 -121.885 -121.894 -121.768 -121.767
[533] -121.867 -122.438 -121.852 -121.946 -121.877 -121.853 -121.985
[540] -121.771 -121.780 -122.254 -122.002 -121.955 -121.856 -122.514
[547] -121.733 -122.456 -121.850 -121.747 -121.861 -121.918 -121.987
[554] -121.826 -122.096 -121.978 -121.981 -121.891 -121.968 -121.743
[561] -121.405 -122.250 -122.415 -122.407 -121.777 -121.882 -122.465
[568] -121.875 -121.321 -122.411 -121.913 -122.462 -121.911 -122.408
[575] -121.907 -122.398 -121.799 -122.474 -121.960 -121.991 -121.974
[582] -122.008 -121.815 -121.886 -122.401 -121.782 -122.496 -121.917
[589] -121.939 -121.364 -121.730 -121.827 -121.914 -121.315 -122.463
[596] -121.929 -121.775 -121.403 -121.726 -122.451 -121.935 -121.787
[603] -121.892 -122.453 -122.467 -121.719 -122.499 -121.932 -121.949
[610] -122.470 -121.851 -122.447 -121.755 -121.872 -121.900 -121.734
[617] -121.714 -122.472 -121.863 -121.802 -121.847 -121.781 -121.725
[624] -121.822 -121.691 -121.937 -121.800 -121.897 -122.507 -121.711
[631] -121.752 -122.449 -122.439 -121.784 -121.779 -121.953 -121.821
[638] -122.504 -121.795 -121.952 -121.857 -122.450 -121.757 -121.925
[645] -121.749 -121.957 -121.837 -121.884 -122.475 -121.736 -121.713
[652] -121.835 -121.896 -122.502 -122.422 -122.469 -121.316 -122.414
[659] -122.486 -122.505 -121.810 -122.519 -122.497 -121.738 -121.854
[666] -121.906 -121.724 -121.950 -122.454 -122.461 -121.765 -121.735
[673] -121.931 -121.402 -122.458 -122.431 -121.742 -122.455 -122.457
[680] -121.325 -121.927 -122.506 -121.846 -121.895 -121.943 -121.676
[687] -121.701 -121.833 -121.766 -122.484 -121.646 -121.916 -122.432
[694] -121.840 -121.941 -122.420 -121.785 -121.933 -122.515 -122.479
[701] -122.430 -121.924 -122.425 -121.823 -121.721 -121.848 -122.433
[708] -122.440 -122.435 -121.750 -121.727 -121.762 -121.809 -121.834
[715] -121.359 -121.934 -122.459 -121.480 -122.503 -122.490 -121.319
[722] -121.803 -121.417 -121.739 -122.473 -121.352 -121.828 -121.942
[729] -121.731 -121.760 -122.512 -121.708 -121.865 -121.761 -122.448
[736] -121.936 -121.820 -122.509 -121.473 -122.441 -122.511 -121.818
[743] -121.819 -122.460 -121.804 -122.452 -121.831 -121.737 -121.792
[750] -121.849 -121.797
par(mfrow=c(1, 4))  # divide graph area in 4 columns

# Box plot for 'price'
boxplot(hdTrain$price, main="Price", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$price)$out))

# Box plot for 'lat'
boxplot(hdTrain$lat, main="lat", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$lat)$out))

# Box plot for 'bedrooms'
boxplot(hdTrain$bedrooms, main="bedrooms", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$bedrooms)$out))

# Box plot for 'bathrooms'
boxplot(hdTrain$bathrooms, main="bathrooms", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$bathrooms)$out)) 

Advanced graphs

Showing the evolution of amount of houses over the years

hd %>%
group_by(yr_built) %>%
summarise(n = n()) %>%
ggplot(aes(x = yr_built, y = n)) +
geom_line(color = 'black') +
geom_point(color = 'blue', size = 0.5) +
geom_smooth(method="lm", color = 'green', size = 0.5) +
theme_linedraw() +
theme(plot.title = element_text(hjust = 0,color = 'black')) +
labs(x = 'year', y = 'count',
     title = "House sales in 1900 - 2015") +
scale_x_continuous(breaks=seq(1900, 2015, 10))

Interactive graph summarizing several variables related to price

plot_ly(y=~price,x=~bedrooms,data = hdTrain,
        hoverinfo = "text",text= ~paste("Price: ",price,"\n","Bathrooms:",bathrooms,"\n","Grade: ",
                                        grade ,"\n","Condition: ",condition,"\n","Year Built:",yr_built))%>%
  add_markers()%>%
  layout(title="Price vs Bedrooms")

Correlation plot

library(corrplot,quietly = T)
set.seed(990)
Nindex<-sapply(hd,is.numeric)
Ndata<-hd[,..Nindex]
corrplot(cor(Ndata))

4. Data preparation

a. Factorizing

Floors, bathrooms, waterfront and view consist of clear categories. That is why their numeric datatype is changed in to the factorial one.

b. Binarizing

sqft_basement and yr_renovated showed a clear distribution in the EDA, with most of the values being 0. With an ifelse condition all other variables are therefore changed into 1.

# Factorization
hd$floors <- sapply(hd$floors, as.factor)
hd$bathrooms <- sapply(hd$bathrooms, as.factor)
hd$waterfront <- sapply(hd$waterfront, as.factor)
hd$view <- sapply(hd$view, as.factor)

# Binarizing sqft_basement and yr_renovated
hd[,"sqft_basement"] <- ifelse(hd[,"sqft_basement"] > 0,1,0)
hd[,"yr_renovated"] <- ifelse(hd[,"yr_renovated"] > 0,1,0)

# Split data table again
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymore

5. Modelling

Linear regression, random forest and gradient boosting are the models chosen to predict the house prices. There will be three iterations, with in between a Feature Engineering step, aiming to increase the score. In the last modeling phase, the parameters of the two latter models are also tuned.

Splitting function

set.seed(1996)
spliting <- sample(1:3, size = nrow(hdTrain), prob = c(0.7, 0.15, 0.15), replace = TRUE)

a. Baseline

formula = price~.

fitControl <- trainControl(method="cv",number = 10)

# Train, validation and testset to score and evaluate models
train0 <- hdTrain[spliting == 1, 3:21]
val0 <- hdTrain[spliting == 2, 3:21]
test0 <- hdTrain[spliting == 3, 3:21]

#Baseline lm
lm_0<-train(formula,data = train0, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_0)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-2597285   -91689   -10263    74977  2597285 

Coefficients: (3 not defined because of singularities)
                   Estimate    Std. Error t value             Pr(>|t|)    
(Intercept)   5672080.02461 3673239.82609   1.544             0.122575    
bedrooms       -20261.23608    2317.01421  -8.745 < 0.0000000000000002 ***
bathrooms2.5   -10275.91912    8491.23600  -1.210             0.226234    
bathrooms3.5    93255.15211   13176.79856   7.077    0.000000000001553 ***
bathrooms1.75  -13491.43569    8147.43885  -1.656             0.097766 .  
bathrooms3.25  122454.78649   13631.80362   8.983 < 0.0000000000000002 ***
bathrooms2.25   -2413.03418    9043.53821  -0.267             0.789609    
bathrooms3      25036.78812   11983.55101   2.089             0.036705 *  
bathrooms2.75  -12857.94320   10679.17843  -1.204             0.228606    
bathrooms1      15547.93530    8148.58804   1.908             0.056408 .  
bathrooms2       3186.93121    8815.73249   0.362             0.717728    
bathrooms6     544907.85816  108706.00957   5.013    0.000000544495624 ***
bathrooms3.75  228328.16459   23085.22153   9.891 < 0.0000000000000002 ***
bathrooms4.5   112213.97108   28787.63076   3.898    0.000097524547988 ***
bathrooms4     228051.59818   23433.40706   9.732 < 0.0000000000000002 ***
bathrooms0.75   50206.97371   29053.68337   1.728             0.084000 .  
bathrooms4.75  597666.87828   46943.33273  12.732 < 0.0000000000000002 ***
bathrooms4.25  390613.35912   33039.04635  11.823 < 0.0000000000000002 ***
bathrooms1.25    6847.75524   83176.83066   0.082             0.934388    
bathrooms5.75 -227483.81187  132360.41085  -1.719             0.085701 .  
bathrooms8    2263048.69293  137157.70092  16.500 < 0.0000000000000002 ***
bathrooms5     403308.10304   57236.24946   7.046    0.000000000001938 ***
bathrooms7.5             NA            NA      NA                   NA    
bathrooms5.5  1255545.45895   84762.17486  14.813 < 0.0000000000000002 ***
bathrooms5.25  712165.06110   71257.97046   9.994 < 0.0000000000000002 ***
bathrooms7.75 4516039.97034  188056.66270  24.014 < 0.0000000000000002 ***
bathrooms6.5             NA            NA      NA                   NA    
bathrooms6.25 1115926.00483  133055.86949   8.387 < 0.0000000000000002 ***
bathrooms0.5   -28780.05960  131019.20595  -0.220             0.826137    
bathrooms6.75            NA            NA      NA                   NA    
sqft_living       101.88086       7.93584  12.838 < 0.0000000000000002 ***
sqft_lot            0.14673       0.05936   2.472             0.013451 *  
floors3         39735.26520   11969.00783   3.320             0.000903 ***
floors2         13370.30343    5611.58725   2.383             0.017205 *  
floors1.5       12502.68765    6785.11368   1.843             0.065403 .  
floors2.5      134941.29037   20553.06677   6.566    0.000000000054008 ***
floors3.5      560374.86994  131469.47077   4.262    0.000020380288885 ***
waterfront1    490125.67867   26134.82596  18.754 < 0.0000000000000002 ***
view2           62227.70004    8460.65740   7.355    0.000000000000204 ***
view1          123239.22798   14942.50231   8.248 < 0.0000000000000002 ***
view3          163590.72319   11501.91721  14.223 < 0.0000000000000002 ***
view4          285326.68972   19449.24522  14.670 < 0.0000000000000002 ***
condition       34662.63168    2940.93590  11.786 < 0.0000000000000002 ***
grade          104056.48514    2701.02748  38.525 < 0.0000000000000002 ***
sqft_above         25.09191       8.73222   2.873             0.004067 ** 
sqft_basement   11887.41879    6695.66593   1.775             0.075859 .  
yr_built        -2054.81240      98.78115 -20.802 < 0.0000000000000002 ***
yr_renovated    63234.78132    9007.77260   7.020    0.000000000002339 ***
zipcode          -535.94144      41.05424 -13.054 < 0.0000000000000002 ***
lat            583213.88079   13288.30428  43.889 < 0.0000000000000002 ***
long          -184185.24693   16447.59371 -11.198 < 0.0000000000000002 ***
sqft_living15      44.21389       4.34594  10.174 < 0.0000000000000002 ***
sqft_lot15         -0.41753       0.08736  -4.779    0.000001781370153 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 184900 on 11920 degrees of freedom
Multiple R-squared:  0.7436,    Adjusted R-squared:  0.7426 
F-statistic: 705.7 on 49 and 11920 DF,  p-value: < 0.00000000000000022
pred_lm_0<-predict(lm_0, newdata = test0)

mape_lm_0<-mape(real=test0$price, predicted = pred_lm_0)
mape_lm_0
[1] 0.2438174
#Baseline rf
rf_0<-randomForest(formula, data = train0, ntree = 100)
summary(rf_0)
                Length Class  Mode     
call                4  -none- call     
type                1  -none- character
predicted       11970  -none- numeric  
mse               100  -none- numeric  
rsq               100  -none- numeric  
oob.times       11970  -none- numeric  
importance         18  -none- numeric  
importanceSD        0  -none- NULL     
localImportance     0  -none- NULL     
proximity           0  -none- NULL     
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             11  -none- list     
coefs               0  -none- NULL     
y               11970  -none- numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     
varImpPlot(rf_0,type=2)

pred_rf_0<-predict(rf_0, newdata = test0)

mape_rf_0<-mape(real=test0$price, predicted = pred_rf_0)
mape_rf_0
[1] 0.1309606
#Plotting the actual and predicted
ggplot(test0,aes(x=price,y=pred_rf_0))+geom_point()+geom_abline(color="blue")

#Baseline GBM
gb_0 <- gbm(formula,data = train0,n.trees = 100,cv.folds = 5)
Distribution not specified, assuming gaussian ...
summary(gb_0)

                        var    rel.inf
sqft_living     sqft_living 40.1830656
grade                 grade 27.8547928
lat                     lat 16.0301247
view                   view  6.5117737
bathrooms         bathrooms  5.2482973
waterfront       waterfront  2.7416587
sqft_living15 sqft_living15  0.5947020
yr_built           yr_built  0.3121457
yr_renovated   yr_renovated  0.2769282
condition         condition  0.1252483
long                   long  0.1212630
bedrooms           bedrooms  0.0000000
sqft_lot           sqft_lot  0.0000000
floors               floors  0.0000000
sqft_above       sqft_above  0.0000000
sqft_basement sqft_basement  0.0000000
zipcode             zipcode  0.0000000
sqft_lot15       sqft_lot15  0.0000000
pred_gb_0<-predict(gb_0, newdata = test0)
Using 100 trees...
mape_gb_0<-mape(real=test0$price, predicted = pred_gb_0)
mape_gb_0
[1] 0.206808

Feature Engineering 1: extract date information and add it to the datatable

#date transformations and variable extractions
hd <- cbind(hd, hd$date) #copy of date column
hd$date <- mdy(hd$date)
hd$date <-as.numeric(as.Date(hd$date, origin = "1900-01-01"))
colnames(hd)[colnames(hd)=="V2"] <- "copy_date"
hd$copy_date <- as.Date(hd$copy_date, format = '%m/%d/%Y')

hd$month <- month(hd$copy_date)
hd$day <- day(hd$copy_date)
hd$year <- year(hd$copy_date)

unique(hd$year) # check years
[1] 2014 2015
str(hd)
Classes 'data.table' and 'data.frame':  21420 obs. of  26 variables:
 $ id           :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ... 
 $ date         : num  16203 16309 16269 16465 16213 ...
 $ price        : num  225000 641250 810000 330000 530000 ...
 $ bedrooms     : int  3 3 4 4 4 4 4 3 4 3 ...
 $ bathrooms    : Factor w/ 29 levels "1.5","2.5","3.5",..: 1 2 3 1 4 3 5 6 2 1 ...
 $ sqft_living  : int  1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
 $ sqft_lot     : int  7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
 $ floors       : Factor w/ 6 levels "1","3","2","1.5",..: 1 2 3 1 1 3 3 1 3 1 ...
 $ waterfront   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ view         : Factor w/ 5 levels "0","2","1","3",..: 1 2 2 1 1 1 1 1 1 1 ...
 $ condition    : int  3 3 3 4 4 3 3 3 3 3 ...
 $ grade        : int  7 10 9 7 7 9 10 8 9 8 ...
 $ sqft_above   : int  1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
 $ sqft_basement: num  0 0 0 0 1 1 0 1 0 1 ...
 $ yr_built     : int  1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
 $ yr_renovated : num  0 0 0 0 0 0 0 0 0 0 ...
 $ zipcode      : int  98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
 $ lat          : num  47.4 47.7 47.6 47.8 47.7 ...
 $ long         : num  -122 -122 -122 -122 -122 ...
 $ sqft_living15: int  1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
 $ sqft_lot15   : int  7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
 $ split        : chr  "train" "train" "train" "train" ...
 $ copy_date    : Date, format: "2014-05-13" "2014-08-27" ...
 $ month        : num  5 8 7 1 5 5 12 10 11 7 ...
 $ day          : int  13 27 18 30 23 20 16 16 26 21 ...
 $ year         : num  2014 2014 2014 2015 2014 ...
 - attr(*, ".internal.selfref")=<externalptr> 
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymore

b. Modeling phase 2

# Train, validation and testset to score and evaluate models
train1 <- hdTrain[spliting == 1, c(3:21, 24:26)]
val1 <- hdTrain[spliting == 2, c(3:21, 24:26)]
test1 <- hdTrain[spliting == 3, c(3:21, 24:26)]

#LM with FE1
lm_1<-train(formula,data = train1, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_1)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-2590561   -91217    -9452    74462  2590561 

Coefficients: (3 not defined because of singularities)
                     Estimate      Std. Error t value             Pr(>|t|)
(Intercept)   -73926807.59095  12345532.56763  -5.988 0.000000002183665464
bedrooms         -20293.68251      2310.59099  -8.783 < 0.0000000000000002
bathrooms2.5     -10252.83910      8467.94690  -1.211             0.226003
bathrooms3.5      92693.56029     13139.59505   7.055 0.000000000001827827
bathrooms1.75    -13726.84965      8125.25931  -1.689             0.091168
bathrooms3.25    123511.48845     13593.72820   9.086 < 0.0000000000000002
bathrooms2.25     -3342.09419      9018.52522  -0.371             0.710956
bathrooms3        24938.58270     11951.19968   2.087             0.036936
bathrooms2.75    -12247.95942     10649.37764  -1.150             0.250122
bathrooms1        15150.94520      8125.93821   1.865             0.062274
bathrooms2         3194.25405      8791.17712   0.363             0.716352
bathrooms6       532937.90633    108406.79214   4.916 0.000000894645045081
bathrooms3.75    228155.90041     23020.09153   9.911 < 0.0000000000000002
bathrooms4.5     110721.66842     28709.63832   3.857             0.000116
bathrooms4       223536.92199     23374.20011   9.563 < 0.0000000000000002
bathrooms0.75     50869.20248     28973.50922   1.756             0.079163
bathrooms4.75    599123.83009     46817.18496  12.797 < 0.0000000000000002
bathrooms4.25    388695.07208     32946.32182  11.798 < 0.0000000000000002
bathrooms1.25      6717.72519     82958.91020   0.081             0.935462
bathrooms5.75   -228845.63104    132021.49247  -1.733             0.083051
bathrooms8      2271127.19348    136783.78612  16.604 < 0.0000000000000002
bathrooms5       409644.55368     57086.56495   7.176 0.000000000000761208
bathrooms7.5               NA              NA      NA                   NA
bathrooms5.5    1262005.49323     84527.07936  14.930 < 0.0000000000000002
bathrooms5.25    722112.89345     71066.19646  10.161 < 0.0000000000000002
bathrooms7.75   4522574.12325    187524.59229  24.117 < 0.0000000000000002
bathrooms6.5               NA              NA      NA                   NA
bathrooms6.25   1126217.14029    132712.59122   8.486 < 0.0000000000000002
bathrooms0.5     -33152.48902    130657.29507  -0.254             0.799704
bathrooms6.75              NA              NA      NA                   NA
sqft_living         101.99207         7.91331  12.889 < 0.0000000000000002
sqft_lot              0.13805         0.05920   2.332             0.019728
floors3           40270.76650     11937.22172   3.374             0.000744
floors2           13896.05520      5596.37502   2.483             0.013040
floors1.5         12854.88457      6766.29544   1.900             0.057478
floors2.5        136233.94943     20496.68231   6.647 0.000000000031294489
floors3.5        569758.27569    131108.18989   4.346 0.000013997559685484
waterfront1      486717.04852     26064.57096  18.674 < 0.0000000000000002
view2             61637.14859      8437.29441   7.305 0.000000000000294234
view1            122091.41205     14901.82220   8.193 0.000000000000000281
view3            162973.27565     11470.41942  14.208 < 0.0000000000000002
view4            285625.04663     19397.49464  14.725 < 0.0000000000000002
condition         36242.42727      2938.66774  12.333 < 0.0000000000000002
grade            104298.35361      2694.51130  38.708 < 0.0000000000000002
sqft_above           24.78066         8.70840   2.846             0.004440
sqft_basement     11709.04984      6676.75481   1.754             0.079507
yr_built          -2049.30921        98.50793 -20.803 < 0.0000000000000002
yr_renovated      65931.97411      8990.09241   7.334 0.000000000000238116
zipcode            -535.05506        40.94193 -13.069 < 0.0000000000000002
lat              585386.41342     13253.34701  44.169 < 0.0000000000000002
long            -183812.68398     16407.15088 -11.203 < 0.0000000000000002
sqft_living15        44.72548         4.33422  10.319 < 0.0000000000000002
sqft_lot15           -0.40804         0.08713  -4.683 0.000002853429975583
month              1898.46611       875.22038   2.169             0.030093
day                  17.30619       196.16046   0.088             0.929700
year              39428.97603      5841.33570   6.750 0.000000000015470838
                 
(Intercept)   ***
bedrooms      ***
bathrooms2.5     
bathrooms3.5  ***
bathrooms1.75 .  
bathrooms3.25 ***
bathrooms2.25    
bathrooms3    *  
bathrooms2.75    
bathrooms1    .  
bathrooms2       
bathrooms6    ***
bathrooms3.75 ***
bathrooms4.5  ***
bathrooms4    ***
bathrooms0.75 .  
bathrooms4.75 ***
bathrooms4.25 ***
bathrooms1.25    
bathrooms5.75 .  
bathrooms8    ***
bathrooms5    ***
bathrooms7.5     
bathrooms5.5  ***
bathrooms5.25 ***
bathrooms7.75 ***
bathrooms6.5     
bathrooms6.25 ***
bathrooms0.5     
bathrooms6.75    
sqft_living   ***
sqft_lot      *  
floors3       ***
floors2       *  
floors1.5     .  
floors2.5     ***
floors3.5     ***
waterfront1   ***
view2         ***
view1         ***
view3         ***
view4         ***
condition     ***
grade         ***
sqft_above    ** 
sqft_basement .  
yr_built      ***
yr_renovated  ***
zipcode       ***
lat           ***
long          ***
sqft_living15 ***
sqft_lot15    ***
month         *  
day              
year          ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 184400 on 11917 degrees of freedom
Multiple R-squared:  0.7452,    Adjusted R-squared:  0.7441 
F-statistic: 670.1 on 52 and 11917 DF,  p-value: < 0.00000000000000022
pred_lm_1<-predict(lm_1, newdata = test1)

mape_lm_1<-mape(real=test1$price, predicted = pred_lm_1)
mape_lm_1
[1] 0.2441572
#RF with FE1
rf_1<-randomForest(formula, data = train1, ntree = 100)
summary(rf_1)
                Length Class  Mode     
call                4  -none- call     
type                1  -none- character
predicted       11970  -none- numeric  
mse               100  -none- numeric  
rsq               100  -none- numeric  
oob.times       11970  -none- numeric  
importance         21  -none- numeric  
importanceSD        0  -none- NULL     
localImportance     0  -none- NULL     
proximity           0  -none- NULL     
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             11  -none- list     
coefs               0  -none- NULL     
y               11970  -none- numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     
varImpPlot(rf_1,type=2)

pred_rf_1<-predict(rf_1, newdata = test1)

mape_rf_1<-mape(real=test1$price, predicted = pred_rf_1)
mape_rf_1
[1] 0.1320853
#Plotting the actual and predicted
ggplot(test1,aes(x=price,y=pred_rf_1))+geom_point()+geom_abline(color="blue")

#GBM with FE 1
gb_1 <- gbm(formula,data = train1,n.trees = 100,cv.folds = 5)
Distribution not specified, assuming gaussian ...
summary(gb_1)

                        var    rel.inf
sqft_living     sqft_living 39.1879971
grade                 grade 27.6046698
lat                     lat 15.2434675
view                   view  7.2956360
bathrooms         bathrooms  5.4210370
waterfront       waterfront  2.8424284
sqft_above       sqft_above  1.2279090
yr_built           yr_built  0.3977170
sqft_living15 sqft_living15  0.3235966
yr_renovated   yr_renovated  0.2050863
long                   long  0.1299746
condition         condition  0.1204806
bedrooms           bedrooms  0.0000000
sqft_lot           sqft_lot  0.0000000
floors               floors  0.0000000
sqft_basement sqft_basement  0.0000000
zipcode             zipcode  0.0000000
sqft_lot15       sqft_lot15  0.0000000
month                 month  0.0000000
day                     day  0.0000000
year                   year  0.0000000
pred_gb_1<-predict(gb_1, newdata = test1)
Using 100 trees...
mape_gb_1<-mape(real=test1$price, predicted = pred_gb_1)
mape_gb_1
[1] 0.2088095

Feature Engineering 2: delete as seen in the EDA, the outlier of 33 bedrooms with a relatively low price

#Remove outlier
plot(table(train1$bedrooms))

train2 <- subset(train1, bedrooms < 12)

hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymore

c. Modeling phase 3

Experimenting with parameter tuning for RF and GBM

#LM with FE2
lm_2<-train(formula,data = train2, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_2)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-2588982   -91049    -9501    74416  2588982 

Coefficients: (3 not defined because of singularities)
                     Estimate      Std. Error t value             Pr(>|t|)
(Intercept)   -74092703.92127  12336535.48950  -6.006 0.000000001957517727
bedrooms         -24289.78034      2488.76201  -9.760 < 0.0000000000000002
bathrooms2.5      -9166.69310      8465.50049  -1.083              0.27891
bathrooms3.5      94438.38489     13136.21857   7.189 0.000000000000690856
bathrooms1.75    -13910.04052      8119.40985  -1.713              0.08670
bathrooms3.25    124918.49031     13587.69226   9.194 < 0.0000000000000002
bathrooms2.25     -2510.93576      9013.97972  -0.279              0.78059
bathrooms3        27141.60274     11953.40705   2.271              0.02319
bathrooms2.75    -10196.26251     10652.24734  -0.957              0.33849
bathrooms1        13806.97645      8125.98471   1.699              0.08932
bathrooms2         3683.11015      8785.46245   0.419              0.67506
bathrooms6       531362.05408    108327.87840   4.905 0.000000945999008011
bathrooms3.75    230066.25119     23007.48895  10.000 < 0.0000000000000002
bathrooms4.5     113787.00639     28697.42361   3.965 0.000073806283397607
bathrooms4       226333.23001     23366.09519   9.686 < 0.0000000000000002
bathrooms0.75     47377.18583     28963.63056   1.636              0.10192
bathrooms4.75    600890.49191     46784.63988  12.844 < 0.0000000000000002
bathrooms4.25    389574.79487     32922.78575  11.833 < 0.0000000000000002
bathrooms1.25      4778.78898     82899.27235   0.058              0.95403
bathrooms5.75   -224831.35160    131927.93466  -1.704              0.08837
bathrooms8      2262020.76940    136699.82602  16.547 < 0.0000000000000002
bathrooms5       413504.33237     57051.73914   7.248 0.000000000000449472
bathrooms7.5               NA              NA      NA                   NA
bathrooms5.5    1259575.11992     84466.95516  14.912 < 0.0000000000000002
bathrooms5.25    726190.24685     71020.38355  10.225 < 0.0000000000000002
bathrooms7.75   4519602.87734    187388.28697  24.119 < 0.0000000000000002
bathrooms6.5               NA              NA      NA                   NA
bathrooms6.25   1121678.93942    132619.42210   8.458 < 0.0000000000000002
bathrooms0.5     -36744.20452    130564.10744  -0.281              0.77839
bathrooms6.75              NA              NA      NA                   NA
sqft_living         105.00381         7.93843  13.227 < 0.0000000000000002
sqft_lot              0.13422         0.05917   2.269              0.02331
floors3           39335.84351     11930.44366   3.297              0.00098
floors2           13521.47760      5592.94708   2.418              0.01564
floors1.5         13927.96315      6765.93134   2.059              0.03956
floors2.5        136549.57184     20481.77624   6.667 0.000000000027282846
floors3.5        571633.35175    131012.72688   4.363 0.000012925838185840
waterfront1      485281.35436     26047.58678  18.631 < 0.0000000000000002
view2             61003.21939      8432.39213   7.234 0.000000000000496278
view1            121525.90246     14891.46968   8.161 0.000000000000000366
view3            162127.36096     11463.69079  14.143 < 0.0000000000000002
view4            284617.89446     19384.67750  14.683 < 0.0000000000000002
condition         36062.66535      2936.80909  12.280 < 0.0000000000000002
grade            103698.25759      2696.14578  38.462 < 0.0000000000000002
sqft_above           24.16392         8.70320   2.776              0.00550
sqft_basement     11175.43353      6673.00944   1.675              0.09402
yr_built          -2056.24844        98.44888 -20.886 < 0.0000000000000002
yr_renovated      65506.76965      8984.04056   7.291 0.000000000000326009
zipcode            -537.89825        40.91723 -13.146 < 0.0000000000000002
lat              584237.10727     13246.31825  44.106 < 0.0000000000000002
long            -184081.47388     16395.23277 -11.228 < 0.0000000000000002
sqft_living15        44.71400         4.33104  10.324 < 0.0000000000000002
sqft_lot15           -0.41375         0.08707  -4.752 0.000002039470433147
month              1924.09057       874.59855   2.200              0.02783
day                   5.99149       196.03419   0.031              0.97562
year              39674.10655      5837.32832   6.797 0.000000000011220734
                 
(Intercept)   ***
bedrooms      ***
bathrooms2.5     
bathrooms3.5  ***
bathrooms1.75 .  
bathrooms3.25 ***
bathrooms2.25    
bathrooms3    *  
bathrooms2.75    
bathrooms1    .  
bathrooms2       
bathrooms6    ***
bathrooms3.75 ***
bathrooms4.5  ***
bathrooms4    ***
bathrooms0.75    
bathrooms4.75 ***
bathrooms4.25 ***
bathrooms1.25    
bathrooms5.75 .  
bathrooms8    ***
bathrooms5    ***
bathrooms7.5     
bathrooms5.5  ***
bathrooms5.25 ***
bathrooms7.75 ***
bathrooms6.5     
bathrooms6.25 ***
bathrooms0.5     
bathrooms6.75    
sqft_living   ***
sqft_lot      *  
floors3       ***
floors2       *  
floors1.5     *  
floors2.5     ***
floors3.5     ***
waterfront1   ***
view2         ***
view1         ***
view3         ***
view4         ***
condition     ***
grade         ***
sqft_above    ** 
sqft_basement .  
yr_built      ***
yr_renovated  ***
zipcode       ***
lat           ***
long          ***
sqft_living15 ***
sqft_lot15    ***
month         *  
day              
year          ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 184300 on 11916 degrees of freedom
Multiple R-squared:  0.7456,    Adjusted R-squared:  0.7444 
F-statistic: 671.5 on 52 and 11916 DF,  p-value: < 0.00000000000000022
pred_lm_2<-predict(lm_2, newdata = test1)

mape_lm_2<-mape(real=test1$price, predicted = pred_lm_2)
mape_lm_2
[1] 0.2441382
#RF with FE2 and parameter tuning
rf_2<-randomForest(formula, data = train2, ntree = 200, mtry = 4, nodesize = 7)
summary(rf_2)
                Length Class  Mode     
call                6  -none- call     
type                1  -none- character
predicted       11969  -none- numeric  
mse               200  -none- numeric  
rsq               200  -none- numeric  
oob.times       11969  -none- numeric  
importance         21  -none- numeric  
importanceSD        0  -none- NULL     
localImportance     0  -none- NULL     
proximity           0  -none- NULL     
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             11  -none- list     
coefs               0  -none- NULL     
y               11969  -none- numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     
varImpPlot(rf_2,type=2)

pred_rf_2<-predict(rf_2, newdata = test1)

mape_rf_2<-mape(real=test1$price, predicted = pred_rf_2)
mape_rf_2
[1] 0.1384995
#Plotting the actual and predicted
ggplot(test1,aes(x=price,y=pred_rf_2))+geom_point()+geom_abline(color="blue")

#GBM with FE2 and parameter tuning
gb_2 <- gbm(formula,data = train2,n.trees = 200, interaction.depth = 2, shrinkage = 0.08, cv.folds = 5)
Distribution not specified, assuming gaussian ...
summary(gb_2)

                        var     rel.inf
sqft_living     sqft_living 30.65771949
grade                 grade 25.61927821
lat                     lat 14.16426632
bathrooms         bathrooms  8.40253201
view                   view  6.14925684
long                   long  4.91186932
sqft_living15 sqft_living15  2.89843267
waterfront       waterfront  2.12003468
yr_built           yr_built  1.54577963
zipcode             zipcode  1.26608828
sqft_above       sqft_above  1.18598924
condition         condition  0.37171398
yr_renovated   yr_renovated  0.21827926
sqft_lot           sqft_lot  0.12753421
sqft_lot15       sqft_lot15  0.10923666
month                 month  0.09220563
year                   year  0.07566302
bedrooms           bedrooms  0.04141828
day                     day  0.02255447
floors               floors  0.02014782
sqft_basement sqft_basement  0.00000000
pred_gb_2<-predict(gb_2, newdata = test1)
Using 196 trees...
mape_gb_2<-mape(real=test1$price, predicted = pred_gb_2)
mape_gb_2
[1] 0.169993

Parameter tuning decreased the score of the gradient boosting model drastically, but the random forest score became worse.

Extract best score

table <- c(mape_lm_0,mape_lm_1,mape_lm_2,
           mape_rf_0, mape_rf_1, mape_rf_2,
           mape_gb_0, mape_gb_1, mape_gb_2)

best_score <- min(table)

MAPE rf_0: 0.1309606

The random forest MAPE of our baseline model is the best score

6. Score Testset

With the randomforest baseline model, the test set is scored and the prediction file saved.

final_test<-predict(rf_0, newdata = hdTest)
hdTest$price <- final_test
Gerald_Walravens_Project <- hdTest[, c(1, 3)]
write.csv(Gerald_Walravens_Project, "Gerald_Walravens_Project.csv")